## R version 3.3.3 (2017-03-06)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 8 (jessie)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] backports_1.1.2 magrittr_1.5    rprojroot_1.3-2 tools_3.3.3    
##  [5] htmltools_0.3.6 Rcpp_0.12.15    stringi_1.1.6   rmarkdown_1.8  
##  [9] knitr_1.20      stringr_1.3.0   digest_0.6.15   evaluate_0.10.1

Our Apache Yarn cluster hosts the flights data representing 123 million flights over 22 years. Read the lecture notes on how to access the Yarn cluster. Connect to the database using sparklyr and answer following questions. You can base your answers on a specific year or the whole data set.

  1. Map the top 10 busiest airports. Size of dots should reflect the number of flights through that destination.
    Hint: You may find this tutorial on Making Maps in R helpful.
    #Map the top 10 busiest airports. Size of dots should reflect the number of 
    #flights through that destination.

    # Determine the n count of flight origins
    originCount <- flights_tbl %>%
    filter(year == 1991) %>%
    group_by(origin) %>%
    tally() %>%
    arrange(desc(n)) %>%
    collect() %>%
    head(15)
    # Determine the n count of flight destinations
    departureCount <- flights_tbl %>%
    filter(year == 1991) %>%
    group_by(dest) %>%
    tally() %>%
    arrange(desc(n)) %>%
    collect() %>%
    head(15)

   colnames(originCount) <- c("origin", "outgoing")
   colnames(departureCount) <- c("destination", "incoming")

  # Determine the "busiest" airports by summing incoming and outcoming flights.
  flightCount <-originCount %>%
  inner_join(departureCount, by = c("origin" = "destination")) %>%
  transmute(airport = origin, totalFlights = incoming + outgoing) %>%
  arrange(desc(totalFlights)) %>%
  head(10)

  # Location Retrieval.
topTenLocations <- flightCount %>%
  inner_join(airports_tbl, by = c("airport" = "faa"), copy = TRUE) %>%
  select(airport, totalFlights, lon, lat) %>%
  mutate(lat = as.double(lat),
         lon = as.double(lon),
         proportion = totalFlights/sum(totalFlights)) %>%
  collect()

  # Create compressed RDS  file.
#library(tidyverse)
##write_rds(topTenLocations, "hw4/busiestAirports.rds")

  # Create a plot of the US MAP.

usmap <- map_data("usa")
states <- map_data("state")

map <- get_map(location = "united states", zoom = 4)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=united%20states&sensor=false
busyUSports <- ggmap(map) +
  geom_point(data = topTenLocations,
             aes(x = lon, y = lat, size = totalFlights),
                 color = "black") +
  geom_label_repel(data = topTenLocations,
                   aes(x= lon, y = lat, label =  airport
                      ), min.segment.length = 0, size = 3.5) +
  ggtitle("Busiest US Airports in 1991")
busyUSports

  1. Map the top 10 busiest direct routes. Size of lines should reflect the number of flights through that route.
#Map the top 10 busiest direct routes.
#Size of lines should reflect the number of flights through that route.

topRoutes <-flights_tbl %>%
  filter(year == 2002) %>%
  group_by(origin, dest) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  collect() %>%
  head(10)

# Retrieve latitude and longitide of origin and dest

topTenOrigin<- topRoutes %>%
  inner_join(airports_tbl, by = c("origin" = "faa"), copy = TRUE) %>%
  select(origin, lat, lon, name,n) %>%
  mutate(latO = as.double(lat),
         lonO = as.double(lon)) %>%
  collect()

topTenDest <- topRoutes %>%
  inner_join(airports_tbl, by = c("dest" = "faa"), copy = TRUE) %>%
  select(origin, lat, lon, name,n) %>%
  mutate(latD = as.double(lat),
         lonD = as.double(lon)) %>%
  collect()

q2 <- bind_cols(topTenOrigin, topTenDest)

q2 <- q2 %>%
  mutate(scale = (n / max(n)) ^ 10)

 #  For the labels, we want to remove duplicate origins so that its not cluttered: 3 different ways to do so:
q2Sliced <- q2 %>%
  group_by(origin) %>%
  filter(row_number() == 1)

q2Sliced2 <- q2 %>%
  .[c(1:6, 8),]

q2Sliced3 <- q2 %>%
  slice(c(1:6,8))

### google map
map <- get_map(location = "united states", zoom = 4)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=united%20states&sensor=false
ggmap(map) +
  geom_point(data = q2,
             aes(x = lonO, y = latO),
             colour = "red", alpha = 0.2) +
  geom_point(data = q2,
             aes(x = lonD, y = latD),
             colour = "red", alpha = 0.2)+
  geom_curve(data = q2,
             aes(x = lonO, y = latO, xend = lonD, yend = latD),    
             arrow = arrow(angle = 15, ends = "first", length = unit(0.2, "cm"), type = "closed"),
             alpha = 0.7, curvature = 0.15, inherit.aes = TRUE) +
  scale_size_continuous(range=c(1,30)) +
  coord_cartesian()

#### simple map
unitedStates <- ggplot(data = states) +
geom_polygon(aes(x = long, y = lat, fill = region, group = group), color = "white") + 
coord_fixed(1.3) +
  geom_point(data = q2,
             aes(x = lonO, y = latO),
             colour = "red", alpha = 0.2) +
  geom_point(data = q2,
             aes(x = lonD, y = latD),
             colour = "red", alpha = 0.2)+
  geom_curve(data = q2,
             aes(x = lonO, y = latO, xend = lonD, yend = latD),    
             arrow = arrow(angle = 15, ends = "first", length = unit(0.5, "cm"), type = "closed"),
             alpha = 0.7, curvature = 0.15, size = q2$scale, inherit.aes = TRUE) +
  scale_size_continuous(range=c(1,30)) +
  coord_cartesian() +
  guides(fill=FALSE) +
  geom_label_repel(data = q2Sliced,
                   aes(x= lonO, y = latO, label = q2Sliced$origin, size = 2)
                  , min.segment.length = 0, size = 3.5) +
  ggtitle("Busiest US Routes in 2002")
unitedStates

  1. LAX:

    (a). Reproduce above plot. Visualize and explain some prominent features you observe. For example, what happened at points 1-5?

#(a). Reproduce above plot. Visualize and explain some prominent features you observe. For example, what happened at points 1-5?
laxFlights <-flights_tbl %>%
  filter(origin == "LAX" | dest == "LAX") %>%
  select(origin, dayofmonth, month, year) %>%
  collect()

#Subset Between 2008 and 1998 Years of LAX flights.
laxFlightDate <- laxFlights %>%
  mutate(date = make_date(year, month, dayofmonth)) %>%
  filter(date >= ymd(19980101) & date <= ymd(20081231)) %>%
  collect()

# Create compressed RDS file.
#write_rds(laxFlightDate, "hw4/laxflightdates.rds")
# Plot

laxFlightDate %>%
  group_by(date) %>%
  count() %>%
  ggplot(aes(x = date, y = n)) +
  geom_line() +
  geom_label(aes(x = ymd(20011230), y = 1110, label = "1")) +
  geom_label(aes(x = ymd(20041201), y = 1000, label = "2")) +
  geom_label(aes(x = ymd(20040630), y = 950, label = "3")) +
  geom_label(aes(x = ymd(20080101), y = 1300, label = "4")) +
  geom_label(aes(x = ymd(20010130), y = 1150, label = "5")) +
  ggtitle("LAX air traffic") +
  coord_fixed()

 # Visualize prominent features.

# Point 1
laxFlightDate  %>%
  group_by(date) %>%
  filter(date >= ymd(20010901) & date <= ymd(20011201)) %>%
  group_by(date) %>%
  count() %>%
  arrange(n) %>%
  collect() %>%
  ggplot(aes(x = date, y = n)) +
  geom_line() +
  ggtitle("Point 1")

# Point 2
laxFlightDate  %>%
  group_by(date) %>%
  filter(date >= ymd(20040731) & date <= ymd(20050101)) %>%
  group_by(date) %>%
  count() %>%
  arrange(n) %>%
  collect() %>%
  ggplot(aes(x = date, y = n)) +
  geom_line() +
  ggtitle("Point 2")

# Point 3
laxFlightDate  %>%
  group_by(date) %>%
  filter(date >= ymd(20040301) & date <= ymd(20041101)) %>%
  group_by(date) %>%
  count() %>%
  arrange(n) %>%
  collect() %>%
  ggplot(aes(x = date, y = n)) +
  geom_line() +
  ggtitle("Point 3")

# Point 4
laxFlightDate  %>%
  group_by(date) %>%
  filter(date >= ymd(20070928) & date <= ymd(20080130)) %>%
  group_by(date) %>%
  count() %>%
  arrange(n) %>%
  collect() %>%
  ggplot(aes(x = date, y = n)) +
  geom_line() +
  ggtitle("Point 4")

# Point 5
laxFlightDate  %>%
  group_by(date) %>%
  filter(date >= ymd(20001115) & date <= ymd(20010225)) %>%
  group_by(date) %>%
  count() %>%
  arrange(n) %>%
  collect() %>%
  ggplot(aes(x = date, y = n)) +
  geom_line() +
  ggtitle("Point 5")

(b). Visualize and explain seasonal effects.
flightsBySeason <-flights_tbl %>%
  select(origin, dest, month, year) %>%
  filter(year >= 1998 && year <= 2008) %>%
  filter(dest == "LAX" | origin == "LAX") %>%
  mutate(season = if_else(month %in% c(3, 4, 5), "spring",
                          if_else(month %in% c(6, 7, 8), "summer",
                                  if_else(month %in% c(9,10, 11), "fall",
                                          if_else(month %in% c(12, 1, 2), "winter", "")
                                          )
                                  )
                          )
        ) %>%
  #group_by(year, season) %>%
  #tally() %>%
  count(year, season) %>%
  arrange(year) %>%
  collect()

ggplot(flightsBySeason) +
  geom_col(aes(x = as.factor(year), y = n, fill = season),
           color = "black") +
  labs(x = "Year", y = "Number of Flights", title = "LAX Air Traffic by Season") +
  scale_fill_brewer(palette = "Pastel2")

(c). Visualize and explain weekly effects.
flightsByWeekday <- flights_tbl %>%
  select(origin, dest, year, dayofweek) %>%
  filter(year >= 1998 && year <= 2008) %>%
  filter(dest == "LAX" | origin == "LAX") %>%
  count(year, dayofweek) %>%
  collect()

flightsByWeekday$dayofweek<-as.factor(flightsByWeekday$dayofweek)
ggplot(flightsByWeekday) +
  geom_col(aes(x = as.factor(year), y = n, fill = dayofweek),
           color = "black") +
  labs(x = "Year", y = "Number of Flights", title = "LAX Air Traffic by Day of Week") +
  scale_fill_brewer(palette = "Pastel2")

(d). Map top 10 destinations from LAX. Size of dots should reflect the number of flights from LAX to that destination.
# Filter the routes by destination: LAX.
topRoutes <-flights_tbl %>%
  select(origin, dest) %>%
  filter(origin == "LAX") %>%
  collect() %>% 
  count(dest) %>%
  arrange(desc(n)) %>%
  head(10)

# Location Retrieval.
destLAX <- topRoutes %>%
  left_join(airports_tbl, by = c("dest" = "faa"), copy = TRUE) %>%
  select(dest, lon, lat, n) %>%
  mutate(lat = as.double(lat),
         lon = as.double(lon)
         ) %>%
  mutate(airport = dest, proportion = n / sum(n))

ggmap(map) +
  geom_point(data = destLAX,
             aes(x = lon, y = lat, size = proportion),
             color = "black") +
  geom_label_repel(data = destLAX,
                   aes(x= lon, y = lat, label =  dest, size = proportion
                   ), min.segment.length = 0, size = 3.5) +
  labs(title = "Top 10 Locations from LAX")

0. Build a predictive model for the arrival delay (arrdelay) of flights flying from LAX. Use the same filtering criteria as in the lecture notes to construct training and validation sets. You are allowed to use a maximum of 5 predictors. The prediction performance of your model on the validation data set will be an important factor for grading this question.

# Select data from an out of time sample
system.time(
modelFlights <- flights_tbl %>%
    filter(origin == "LAX") %>%
    filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>%
    filter(depdelay > 15 & depdelay < 240) %>%
    filter(arrdelay > -60 & arrdelay < 360) %>%
    filter(year >= 2003 & year <= 2007) %>%
    mutate(weekDay = as.character(dayofweek)) %>%
    select(year, month, arrdelay, depdelay, distance, weekDay)
)
##    user  system elapsed 
##   0.004   0.000   0.006
sampleSplit <- modelFlights %>%
      sdf_partition(train = 0.8, valid = 0.2, seed= 250)

mlr <- sampleSplit$train %>%
  ml_linear_regression(arrdelay ~ distance + depdelay + weekDay)
summary(mlr)
## Call: ml_linear_regression.tbl_spark(., arrdelay ~ distance + depdelay + weekDay)  
## 
## Deviance Residuals (approximate):
##      Min       1Q   Median       3Q      Max 
## -193.070   -7.181   -1.515    5.246  334.094 
## 
## Coefficients:
## (Intercept)    distance    depdelay   weekDay_5   weekDay_7   weekDay_4 
## -1.55690020 -0.00176854  1.01481246  0.62609083  0.35561858  0.85893367 
##   weekDay_1   weekDay_3   weekDay_6 
##  0.67183093  0.52893317 -1.22533557 
## 
## R-Squared: 0.8828
## Root Mean Squared Error: 13.48
# Apply MLR to LAX flight data in 2018.
system.time(
LAX2008 <-flights_tbl %>%
  filter(origin == "LAX") %>%
  filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>%
  filter(depdelay > 15 & depdelay < 240) %>%
  filter(arrdelay > -60 & arrdelay < 360) %>%
  filter(year == 2008) %>%
  mutate(weekDay = as.character(dayofweek)) %>%
  select(year, month, arrdelay, depdelay, distance, weekDay)
)
##    user  system elapsed 
##   0.004   0.000   0.027
#Determine prediction model.

predictionVal2008 <- sdf_predict(LAX2008, mlr) %>%
  mutate(squareRes = (prediction - arrdelay)^2) %>%
  collect

# Calculate Root Mean Squared Error.
rootmse <- sqrt(sum(predictionVal2008$squareRes)/count(predictionVal2008))
rootmse
##          n
## 1 14.27216
# Visualize actual arrival delay against predicted arrival delay.

PredictionData <- predictionVal2008 %>%
    group_by(weekDay) %>%
    summarize(actual = mean(arrdelay), prediction = mean(prediction), freq = n())

  # Plot
  ggplot(PredictionData, aes(actual, prediction)) + 
  geom_point(alpha = 0.75, color = 'red', shape = 3) +
  geom_abline(intercept = 0, slope = 1, alpha = 0.15, color = 'blue') +
  geom_text(aes(label = substr(weekDay, 1, 20)), size = 3, alpha = 0.75, vjust = -1) +
  labs(title='Actual arrival Delay on Predicted Delay', x = 'Actual', y = 'Predicted')

  1. Visualize and explain any other information you want to explore.
  #Map the top 10 Origins to SFO in 1991.
  
  topSF <-flights_tbl %>%
    filter(year == 1991 & dest == "SFO") %>%
    group_by(origin, dest) %>%
    summarize(n = n()) %>%
    arrange(desc(n)) %>%
    collect() %>%
    head(10)
  
  # Retrieve latitude and longitide of origin and dest
  
  topTenSF <- topSF %>%
    inner_join(airports_tbl, by = c("origin" = "faa"), copy = TRUE) %>%
    select(origin, dest, lat, lon, name,n) %>%
    mutate(latO = as.double(lat),
           lonO = as.double(lon)) %>%
    collect()
  
  usmap <- map_data("usa")
  states <- map_data("state")
  SFOflights <- ggplot(data = states) +
    geom_polygon(aes(x = long, y = lat, fill = region, group = group), color = "white") + 
    coord_fixed(1.3) +
    geom_point(data = topTenSF,
               aes(x = lonO, y = latO, size = n),
               colour = "red", alpha = 0.3) +
    geom_point(data = topTenSF,
               aes(x = -122.375, y = 37.61899948120117),
               colour = "red", alpha = 0.2)+
    geom_curve(data = topTenSF,
               aes(x = lonO, y = latO, xend = -122.375, yend = 37.61899948120117),    
               arrow = arrow(angle = 15, length = unit(0.5, "cm"), type = "closed"),
               alpha = 0.7, inherit.aes = TRUE) +
    scale_size_continuous(range=c(1,30)) +
    coord_cartesian() +
    guides(fill = FALSE) +
    ggtitle("Top 10 Flight Origins to SFO in 1991")
  SFOflights